Recipe Rating Analysis: Nutritional Values and Ingredients
1 Data Description and Cleaning
1.1 Loading data
Here we load the dataset and do some cleaning and processing. We standardise the variable names and add an ID column so that each recipe has a unique identifier.
# Load the raw recipe data, standardise the column names and add a unique
# recipe ID as the first column.
recipes_raw <- read.csv(here("data/epi_r.csv"))
recipes <- recipes_raw %>%
  clean_names() %>%
  # row_number() stays correct on an empty data frame, whereas 1:nrow(.)
  # would produce c(1, 0) when there are no rows
  mutate(ID = row_number()) %>%
  select(ID, everything())
1.2 Data Description
# Two-column overview table: variable name (bold in markdown) and meaning
tibble(
  Variables = c(
    "**ID**", "**title**", "**rating**", "**calories**", "**protein**",
    "**fat**", "**sodium**", "**674 other binary variables**"
  ),
  Meaning = c(
    "Unique ID",
    "Recipe name",
    "Rating of the recipe",
    "Calories contained in the recipe",
    "Protein contained in the recipe (grams)",
    "Fat contained in the recipe (grams)",
    "Sodium contained in the recipe (milligrams)",
    "The rest of the data is made of many binary variables, incl. ingredients, types of recipes, US States, diet preferences, etc."
  )
) %>%
  kbl() %>%
  kable_styling(position = "center")
| Variables | Meaning |
|---|---|
| ID | Unique ID |
| title | Recipe name |
| rating | Rating of the recipe |
| calories | Calories contained in the recipe |
| protein | Protein contained in the recipe (grams) |
| fat | Fat contained in the recipe (grams) |
| sodium | Sodium contained in the recipe (milligrams) |
| 674 other binary variables | The rest of the data is made of many binary variables, incl. ingredients, types of recipes, US States, diet preferences, etc. |
1.3 Classifying variables into categories
Given the large number of variables in the dataset (680), we decided to group them into categories so that we can aggregate them and use them more easily.
Specifically, here are the things we had to solve when creating categories:
- Merge “father_s_day” and “fathers_day” same for mother’s day and for new year’s and st patrick and valentines day, vermout and vermouth - DONE
- Check what leafy_green is and how many obs there is of it
- Same for “legume”
- Decide if we group beans together or leave them in vegetables - DONE
- What do we do with “meat”, meatball and meatloaf. What about rabbit and for sausage or steak and venison - DONE
- Put nutmeg in spices or nuts? - DONE
- do we create a seeds category –> for example for “poppy” (put in spices for now) and for “seed” and sesame - DONE
- We should probably create a sauce category, and a “full_meal”
- where do we put tapioca and yuca - DONE
- do we put buttermilk in drinks or dairy?
- check if dorie_greenspa column exist or if it was just a typo without the N
- check how many observations with phyllo_puff_pastry_dough
- do we separate fish and seafood?
We have now solved all these issues and manually classified every variable of our dataset in specific categories shown below.
# ---- Non-ingredient categories ----
# Each vector lists the (cleaned) column names assigned to one category.

# Geography
us_states <- c(
  "alabama", "alaska", "arizona", "california", "colorado", "connecticut",
  "florida", "georgia", "hawaii", "idaho", "illinois", "indiana", "iowa",
  "kansas", "kentucky", "louisiana", "maine", "maryland", "massachusetts",
  "michigan", "minnesota", "mississippi", "missouri", "nebraska",
  "new_hampshire", "new_jersey", "new_mexico", "new_york", "north_carolina",
  "ohio", "oklahoma", "oregon", "pennsylvania", "rhode_island",
  "south_carolina", "tennessee", "texas", "utah", "vermont", "virginia",
  "washington", "west_virginia", "wisconsin"
)
# NOTE(review): "london" and "paris" are listed here although the vector is
# named us_cities -- confirm this is intended.
us_cities <- c(
  "aspen", "atlanta", "beverly_hills", "boston", "brooklyn", "buffalo",
  "cambridge", "chicago", "columbus", "costa_mesa", "dallas", "denver",
  "healdsburg", "hollywood", "houston", "kansas_city", "lancaster",
  "las_vegas", "london", "los_angeles", "louisville", "miami", "minneapolis",
  "new_orleans", "pacific_palisades", "paris", "pasadena", "pittsburgh",
  "portland", "providence", "san_francisco", "santa_monica", "seattle",
  "st_louis", "washington_d_c", "westwood", "yonkers"
)
# "mezcal" used to be listed here; it is a spirit, so it now lives in `alcohol`
countries <- c(
  "australia", "bulgaria", "canada", "chile", "cuba", "dominican_republic",
  "egypt", "england", "france", "germany", "guam", "haiti", "ireland",
  "israel", "italy", "jamaica", "japan", "mexico", "peru", "philippines",
  "spain", "switzerland"
)

# Alcoholic drinks
# NOTE(review): "long_beach" looks like a place name rather than a drink --
# confirm whether it belongs in us_cities instead.
alcohol <- c(
  "alcoholic", "amaretto", "beer", "bitters", "bourbon", "brandy", "calvados",
  "campari", "chambord", "champagne", "chartreuse", "cocktail",
  "cognac_armagnac", "creme_de_cacao", "digestif", "eau_de_vie",
  "fortified_wine", "frangelico", "gin", "grand_marnier", "grappa", "kahlua",
  "kirsch", "liqueur", "long_beach", "margarita", "marsala", "martini",
  "mezcal", "midori", "pernod", "port", "punch", "red_wine", "rum", "sake",
  "sangria", "scotch", "sherry", "sparkling_wine", "spirit", "spritzer",
  "tequila", "triple_sec", "vermouth", "vodka", "whiskey", "white_wine",
  "wine"
)

# Tags that do not fit any other category
wtf <- c(
  "bon_appetit", "bon_app_tit", "condiment_spread", "cr_me_de_cacao",
  "epi_ushg", "flaming_hot_summer", "frankenrecipe", "harpercollins",
  "house_garden", "no_meat_no_problem", "parade", "sandwich_theory", "self",
  "shower", "tested_improved", "windsor", "weelicious", "snack_week",
  "tailgating", "quick_and_healthy", "picnic", "kitchen_olympics",
  "house_cocktail", "hors_d_oeuvre", "frozen_dessert", "freezer_food",
  "edible_gift", "cookbook_critic", "cook_like_a_diner", "condiment",
  "cocktail_party", "camping", "buffet", "x30_days_of_groceries",
  "x_cakeweek", "x_wasteless", "x22_minute_meals", "x3_ingredient_recipes"
)
chef <- c(
  "anthony_bourdain", "dorie_greenspan", "emeril_lagasse", "nancy_silverton",
  "suzanne_goin"
)
interesting <- c(
  "advance_prep_required", "entertaining", "epi_loves_the_microwave",
  "friendsgiving", "game", "gourmet", "healthy", "high_fiber", "hot_drink",
  "kid_friendly", "kidney_friendly", "low_cal", "low_cholesterol",
  "microwave", "no_cook", "one_pot_meal", "oscars", "paleo", "pescatarian",
  "poker_game_night", "potluck", "quick_easy", "cookbooks", "leftovers"
)

# Time of the year
seasons_vec <- c("winter", "spring", "summer", "fall")
celebrations <- c(
  "anniversary", "back_to_school", "bastille_day", "birthday", "christmas",
  "christmas_eve", "cinco_de_mayo", "date", "diwali", "easter",
  "engagement_party", "family_reunion", "father_s_day", "fourth_of_july",
  "graduation", "halloween", "hanukkah", "kentucky_derby", "kwanzaa",
  "labor_day", "lunar_new_year", "mardi_gras", "mother_s_day",
  "new_year_s_day", "new_year_s_eve", "oktoberfest", "party", "passover",
  "persian_new_year", "purim", "ramadan", "rosh_hashanah_yom_kippur",
  "shavuot", "st_patrick_s_day", "sukkot", "super_bowl", "thanksgiving",
  "valentine_s_day", "wedding"
)

# Non-alcoholic drinks
drink_no_alcohol_vec <- c(
  "apple_juice", "fruit_juice", "iced_tea", "lemon_juice", "lime_juice",
  "orange_juice", "pomegranate_juice", "tea"
)

# Preparation
tools <- c(
  "coffee_grinder", "double_boiler", "food_processor", "ice_cream_machine",
  "juicer", "mandoline", "mixer", "mortar_and_pestle", "pasta_maker",
  "ramekin", "skewer", "slow_cooker", "smoker", "wok", "blender",
  "candy_thermometer", "pressure_cooker"
)
cooking_techniques <- c(
  "raw", "saute", "freeze_chill", "fry", "stir_fry", "simmer", "boil",
  "broil", "bake", "braise", "chill", "deep_fry", "steam", "rub", "roast",
  "poach", "pan_fry", "marinate", "grill_barbecue", "grill"
)

# Nutrition, meal type and diet
nutritional_values <- c("calories", "protein", "fat", "sodium")
recipe_type_vec <- c(
  "aperitif", "appetizer", "breakfast", "brunch", "dessert", "dinner",
  "lunch", "side", "snack"
)
diet_preferences_vec <- c(
  "dairy_free", "fat_free", "kosher", "kosher_for_passover", "low_carb",
  "low_fat", "low_sodium", "low_sugar", "low_no_sugar", "non_alcoholic",
  "no_sugar_added", "organic", "peanut_free", "soy_free", "sugar_conscious",
  "tree_nut_free", "vegan", "vegetarian", "wheat_gluten_free"
)
### Ingredients
# Low-level categories: one vector of column names per ingredient family

vegetables_vec <- c(
  "artichoke", "arugula", "asparagus", "butternut_squash", "bean", "beet",
  "bell_pepper", "bok_choy", "broccoli", "broccoli_rabe", "brussel_sprout",
  "cabbage", "capers", "carrot", "cauliflower", "celery", "chard",
  "chile_pepper", "collard_greens", "corn", "cucumber", "eggplant", "endive",
  "escarole", "fennel", "garlic", "ginger", "green_bean",
  "green_onion_scallion", "horseradish", "jerusalem_artichoke", "jicama",
  "kale", "leafy_green", "leek", "legume", "lentil", "lettuce", "lima_bean",
  "mushroom", "mustard_greens", "okra", "onion", "parsnip", "pea", "pickles",
  "poblano", "pumpkin", "radicchio", "radish", "root_vegetable", "rutabaga",
  "salad", "shallot", "soy", "spinach", "squash", "sugar_snap_pea",
  "tapioca", "tomatillo", "tomato", "turnip", "watercress", "yellow_squash",
  "yuca", "zucchini"
)

# Meat, split by animal
pork_meat_vec <- c(
  "bacon", "ham", "pork", "pork_chop", "pork_rib", "pork_tenderloin",
  "prosciutto"
)
lamb_meat_vec <- c(
  "ground_lamb", "lamb", "lamb_chop", "lamb_shank", "rack_of_lamb"
)
beef_meat_vec <- c(
  "beef", "beef_rib", "beef_shank", "beef_tenderloin", "brisket",
  "ground_beef", "hamburger", "veal"
)
meat_with_wings_vec <- c(
  "chicken", "duck", "goose", "poultry", "poultry_sausage", "quail", "turkey"
)
meat_various_vec <- c(
  "meatball", "meatloaf", "rabbit", "sausage", "steak", "venison"
)

# Seafood and fish are kept apart; they replace the former single vector:
# stuff_in_the_water <- c("anchovy", "bass", "caviar", "clam", "cod", "crab", "fish", "halibut", "lobster", "mussel", "octopus", "oyster", "salmon", "sardine", "scallop", "seafood", "shellfish", "shrimp", "snapper", "squid", "swordfish", "tilapia", "trout", "tuna")
seafood_vec <- c(
  "clam", "crab", "lobster", "mussel", "octopus", "oyster", "scallop",
  "shellfish", "shrimp", "squid"
)
fish_vec <- c(
  "anchovy", "bass", "caviar", "cod", "halibut", "salmon", "sardine",
  "snapper", "swordfish", "tilapia", "trout", "tuna"
)

herbs_vec <- c(
  "anise", "basil", "chive", "cilantro", "coriander", "dill", "lemongrass",
  "mint", "oregano", "parsley", "rosemary", "sage", "tarragon", "thyme"
)
nuts_vec <- c(
  "almond", "cashew", "chestnut", "hazelnut", "macadamia_nut", "peanut",
  "pecan", "pine_nut", "pistachio", "tree_nut", "walnut"
)
cereals_vec <- c(
  "barley", "bran", "bulgur", "grains", "granola", "oat", "quinoa", "rye",
  "whole_wheat"
)
carbs_vec <- c(
  "brown_rice", "chickpea", "cornmeal", "couscous", "hominy_cornmeal_masa",
  "orzo", "pasta", "potato", "rice", "semolina", "sweet_potato_yam",
  "wild_rice"
)
fruits_vec <- c(
  "apple", "apricot", "asian_pear", "avocado", "banana", "berry",
  "blackberry", "blueberry", "cantaloupe", "cherry", "citrus", "coconut",
  "cranberry", "currant", "dried_fruit", "fig", "grape", "grapefruit",
  "guava", "honeydew", "kiwi", "kumquat", "lemon", "lime", "lingonberry",
  "lychee", "mango", "melon", "nectarine", "olive", "orange", "papaya",
  "passion_fruit", "peach", "pear", "persimmon", "pineapple", "plantain",
  "plum", "pomegranate", "prune", "quince", "raisin", "raspberry", "rhubarb",
  "strawberry", "tamarind", "tangerine", "tropical_fruit", "watermelon"
)
dessert_vec <- c(
  "biscuit", "brownie", "butterscotch_caramel", "cake", "candy", "chocolate",
  "cobbler_crumble", "cookie", "cookies", "cranberry_sauce", "crepe",
  "cupcake", "honey", "jam_or_jelly", "maple_syrup", "marshmallow", "muffin",
  "pancake", "pastry", "pie", "smoothie", "sorbet", "souffle_meringue",
  "waffle"
)
cheeses_vec <- c(
  "blue_cheese", "brie", "cheddar", "cottage_cheese", "cream_cheese", "feta",
  "fontina", "goat_cheese", "gouda", "monterey_jack", "mozzarella",
  "parmesan", "ricotta", "swiss_cheese"
)
dairy_vec <- c(
  "butter", "buttermilk", "custard", "egg_nog", "ice_cream", "marscarpone",
  "milk_cream", "sour_cream", "yogurt"
)
spices_vec <- c(
  "caraway", "cardamom", "chili", "cinnamon", "clove", "cumin", "curry",
  "hot_pepper", "jalapeno", "marinade", "nutmeg", "paprika", "pepper",
  "poppy", "saffron", "sesame", "sesame_oil", "soy_sauce", "vanilla",
  "wasabi"
)
# Top-level categories
# general_categories is used to select the columns in ingredients_df; it
# could also be used later on for the for loop.
general_categories <- c(
  "vegetable", "meat", "fish", "seafood", "herb", "nut", "fruit", "drink",
  "cheese", "dairy", "spice"
)
# Every meat column, whatever the animal
all_meats <- c(
  beef_meat_vec, pork_meat_vec, lamb_meat_vec, meat_with_wings_vec,
  meat_various_vec
)
# Fish first, then seafood
all_fish_seafood <- c(fish_vec, seafood_vec)
# Every specific ingredient column, plus "egg", which has no vector of its own
all_ingredients <- c(
  vegetables_vec, all_meats, all_fish_seafood, herbs_vec, nuts_vec,
  cereals_vec, carbs_vec, fruits_vec, drink_no_alcohol_vec, dessert_vec,
  cheeses_vec, dairy_vec, spices_vec, "egg"
)
# Columns which are not ingredients and that we still need to sort
to_sort <- c(
  "backyard_bbq", "bread", "breadcrumbs", "brine", "burrito",
  "casserole_gratin", "coffee", "flat_bread", "hummus", "iced_coffee",
  "lasagna", "macaroni_and_cheese", "mayonnaise", "mustard", "noodle",
  "oatmeal", "omelet", "peanut_butter", "pizza", "pot_pie", "potato_salad",
  "quiche", "rose", "salad_dressing", "salsa", "sandwich", "sauce", "seed",
  "soup_stew", "stew", "stock", "stuffing_dressing", "taco", "tart", "tofu",
  "tortillas", "vinegar", "frittata", "molasses", "sourdough", "fritter",
  "phyllo_puff_pastry_dough", "dip"
)
# Whole list of columns to remove for now, so that only the columns that are
# not classified yet remain to be sorted.
# The season, recipe-type and diet vectors are referenced by the names they
# are defined under (seasons_vec, recipe_type_vec, diet_preferences_vec);
# the previous names (season, repice_type, diet_preferences) do not exist.
to_remove_temp <- c(
  us_states, us_cities, countries, alcohol, wtf, chef, interesting,
  seasons_vec, celebrations, tools, cooking_techniques, nutritional_values,
  recipe_type_vec, diet_preferences_vec, all_ingredients, to_sort
)
# Keep only the columns that have not been assigned to a category yet.
# any_of() ignores names in to_remove_temp that are not columns of recipes
# (the reason a plain select() on the vector failed), and select() always
# returns a data frame, even if a single column is left.
recipes_to_filter <- recipes %>%
  select(-any_of(to_remove_temp))
# One-column tibble with all the remaining column names, to be able to sort
# the ingredients
names <- recipes_to_filter %>%
  colnames() %>%
  as_tibble()
# recipes %>%
# select(iceland)
# filter(x_cakeweek==1)
# Check whether some values in our categories are not columns in recipes
# --> 19 of them were not, so they were deleted from the vectors
checking <- tibble(to_remove_temp[!to_remove_temp %in% colnames(recipes)])
checking
# Same check for the ingredients only, because one entry of the ingredient
# vector seemed not to be a column name
checking_ing <- tibble(all_ingredients[!all_ingredients %in% colnames(recipes)])
checking_ing
# No name comes out, so the only logical conclusion is that there is a
# duplicated ingredient name --> it was arugula
all_ingredients[duplicated(all_ingredients)]
2 Data Cleaning
2.1 Analysis of NAs
# Title plus the four nutritional values of every recipe
recipes_nutrition <- select(recipes, title, calories, protein, fat, sodium)
# Row positions of the recipes with at least one missing value
na_obs <- which(rowSums(is.na(recipes_nutrition)) > 0)
# Subset of the nutrition data restricted to those recipes
df_na <- recipes_nutrition[na_obs, ]
# Number of missing values per recipe, and how often each number occurs
na_count <- rowSums(is.na(df_na))
freq_table_na <- table(na_count)
freq_na <- mutate(
  as.data.frame(freq_table_na),
  na_count = as.character(na_count)
)
# Horizontal bar chart of the frequency of each NA count
ggplot(freq_na, aes(x = na_count, y = Freq)) +
  geom_col() +
  labs(
    title = "Number of NAs in nutritional values per recipe",
    x = "Number of NAs",
    y = "Frequency"
  ) +
  coord_flip()
# Among the recipes which have NAs, many have all 4 nutritional values
# missing: 4117 out of 4188 recipes. Without any other information available,
# imputing such values would not make any sense.
# We could try to impute the 29 recipes that have only 1 NA; doing the same
# for the 42 recipes with 2 NAs would not deliver accurate and satisfying
# results. However, we believe imputation is not worth it: the nutritional
# values per recipe are themselves estimated, so an imputation would be an
# estimation of an estimation. To what extent could it be reliable? We decide
# to eliminate recipes with NA values, which still leaves 15864 recipes
# without NAs for our analysis.
# drop_na() without column arguments removes rows with an NA in any column.
recipes <- drop_na(recipes)
# 15864 obs after dropping NAs
2.2 Eliminate recipes with rating equal to zero
# Frequency of each rating value, as a data frame with readable column names
rating_count <- rename(
  as.data.frame(table(recipes$rating)),
  rating = Var1,
  frequency = Freq
)
# There are 1293 recipes which have a rating equal to zero. Some of those
# might be unpopular, others might be too recent to have a rating. For the
# purpose of our analysis, we decide to eliminate these specific recipes.
recipes <- filter(recipes, rating != 0)
# We are left with 14568 observations after removing NAs and obs with a 0 rating value
2.3 Discard copies of recipes
# We want to eliminate recipes that appear several times. Recipes that share
# a title but have different nutritional values are kept: this indicates
# that there are various ways to prepare a specific recipe.
# For instance, Almond Butter Crisps can be found twice in the data set,
# with ID=1026 and ID=8908.
# recipes %>%
# filter(ID == 1026)
# A copy is a row with the same title, rating and nutritional values as an
# earlier row; only the first occurrence is kept, with all its columns.
unique_recipes <- recipes %>%
  distinct(
    title, rating, protein, sodium, fat, calories,
    .keep_all = TRUE
  )
# unique_recipes %>%
# filter(ID == 8908)
# Now the data set is free from useless copies. We discarded 1288 copies in
# total. We lose a bit less if we first remove the recipes without specific
# ingredients, meaning that some duplicate copies don't contain specific
# ingredients either.
recipes <- unique_recipes
2.4 Removing recipes without specific ingredients listed
Here we are facing a challenge regarding the general ingredient categories. Indeed, when doing computations on the binary columns, there is no issue since, whether the recipe contains specific ingredients in a category, or only a 1 in the general category, then that information is captured in the corresponding binary column.
However, if we want to compute the total number of ingredients in each category that is present in recipes, then we are facing problems. To illustrate, let’s assume that we have a recipe which contains 3 vegetables (specific columns in the vegetables_vec). In addition, for that recipe, the general column is also a 1 –> then by summing up, we get 4 ingredients when it should be 3.
Another problem is related to recipes for which the only column in a category (e.g., vegetables) that has a 1 is the general category (i.e., vegetable), and there isn’t any specific ingredient listed within the vegetable category (in vegetables_vec) –> this can lead to issues when counting the number of specific ingredients per category.
In order to decide whether we want to analyse with or without general categories, let’s see how many observations would remain if we remove all the observations for which a general category is set to 1 while all the specific ingredients in that category are set to 0.
# Filter out the observations which have a 1 for a general category and 0s
# for every specific ingredient of that category. The conditions passed to
# a single filter() are combined with AND, so a recipe is kept only if it
# passes the check for every category.
recipes <- recipes %>%
  filter(
    !(vegetable == 1 & if_all(all_of(vegetables_vec), ~ .x == 0)),
    !(meat == 1 & if_all(all_of(all_meats), ~ .x == 0)),
    !(fish == 1 & if_all(all_of(fish_vec), ~ .x == 0)),
    !(seafood == 1 & if_all(all_of(seafood_vec), ~ .x == 0)),
    !(herb == 1 & if_all(all_of(herbs_vec), ~ .x == 0)),
    !(nut == 1 & if_all(all_of(nuts_vec), ~ .x == 0)),
    !(fruit == 1 & if_all(all_of(fruits_vec), ~ .x == 0)),
    !(drink == 1 & if_all(all_of(drink_no_alcohol_vec), ~ .x == 0)),
    !(cheese == 1 & if_all(all_of(cheeses_vec), ~ .x == 0)),
    !(dairy == 1 & if_all(all_of(dairy_vec), ~ .x == 0)),
    !(spice == 1 & if_all(all_of(spices_vec), ~ .x == 0))
  )
# We are left with 11382 obs after removing all recipes which have no
# specific ingredient in at least one category while that category's general
# variable is at 1
### without duplicated recipes we are left at 10321 obs
3 EDA
3.1 Nutrution EDA
3.1.1 Structure and summary
# Structure of the data, shown on the first 20 recipes
str(head(recipes, 20))
#> 'data.frame': 20 obs. of 681 variables:
#> $ ID : int 1 2 6 9 10 11 13 14 15 16 ...
#> $ title : chr "Lentil, Apple, and Turkey Wrap "..
#> $ rating : num 2.5 4.38 4.38 4.38 3.75 ...
#> $ calories : num 426 403 948 170 602 256 766 174 1..
#> $ protein : num 30 18 19 7 23 4 12 11 4 5 ...
#> $ fat : num 7 23 79 10 41 5 48 12 3 31 ...
#> $ sodium : num 559 1439 1042 1272 1696 ...
#> $ x_cakeweek : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x_wasteless : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x22_minute_meals : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x3_ingredient_recipes : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ x30_days_of_groceries : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ advance_prep_required : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alabama : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alaska : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ alcoholic : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ almond : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ amaretto : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anchovy : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anise : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anniversary : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ anthony_bourdain : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ aperitif : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ appetizer : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ apple : num 1 0 0 0 0 0 0 0 0 0 ...
#> $ apple_juice : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ apricot : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ arizona : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ artichoke : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ arugula : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ asian_pear : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ asparagus : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ aspen : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ atlanta : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ australia : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ avocado : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ back_to_school : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ backyard_bbq : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bacon : num 0 0 1 0 0 0 0 0 0 0 ...
#> $ bake : num 0 1 0 0 0 0 1 0 0 0 ...
#> $ banana : num 0 0 0 0 0 0 1 0 0 0 ...
#> $ barley : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ basil : num 0 0 1 0 0 0 0 0 0 0 ...
#> $ bass : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bastille_day : num 0 1 0 0 0 0 0 0 0 0 ...
#> $ bean : num 1 0 0 0 0 0 0 0 0 0 ...
#> $ beef : num 0 0 0 1 0 0 0 0 0 0 ...
#> $ beef_rib : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beef_shank : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beef_tenderloin : num 0 0 0 0 0 0 0 1 0 0 ...
#> $ beer : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beet : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bell_pepper : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ berry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ beverly_hills : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ birthday : num 0 0 0 0 0 0 1 0 0 0 ...
#> $ biscuit : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bitters : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blackberry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blender : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blue_cheese : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ blueberry : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ boil : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bok_choy : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bon_appetit : num 0 1 1 0 0 1 1 1 1 0 ...
#> $ bon_app_tit : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ boston : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bourbon : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ braise : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bran : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brandy : num 0 0 0 0 0 0 0 1 0 0 ...
#> $ bread : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ breadcrumbs : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ breakfast : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brie : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brine : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brisket : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broccoli : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broccoli_rabe : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ broil : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brooklyn : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brown_rice : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brownie : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brunch : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ brussel_sprout : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buffalo : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buffet : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bulgaria : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ bulgur : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ burrito : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butter : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ buttermilk : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butternut_squash : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ butterscotch_caramel : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cabbage : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cake : num 0 0 0 0 0 0 1 0 0 0 ...
#> $ california : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ calvados : num 0 0 0 0 0 0 0 0 0 0 ...
#> $ cambridge : num 0 0 0 0 0 0 0 0 0 0 ...
#> [list output truncated]
# Apart from the recipe title, the columns shown above are stored as numbers, but in reality just 5 variables could be considered numerical: "rating", "calories", "protein", "fat" and "sodium". The other variables should be considered categorical since they allow only for 0 or 1 values.
# Let's have a different look at these 5 variables with the dfSummary function.
recipes %>%
select(rating, calories, protein, fat, sodium) %>%
dfSummary(style = "grid")
#> Data Frame Summary
#> recipes
#> Dimensions: 10321 x 5
#> Duplicates: 4
#>
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
#> | No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Valid | Missing |
#> +====+===========+===========================+======================+=============+==========+=========+
#> | 1 | rating | Mean (sd) : 4.1 (0.6) | 1.25 : 87 ( 0.8%) | | 10321 | 0 |
#> | | [numeric] | min < med < max: | 1.88!: 49 ( 0.5%) | | (100.0%) | (0.0%) |
#> | | | 1.2 < 4.4 < 5 | 2.50 : 276 ( 2.7%) | | | |
#> | | | IQR (CV) : 0.6 (0.2) | 3.12!: 842 ( 8.2%) | I | | |
#> | | | | 3.75 : 2956 (28.6%) | IIIII | | |
#> | | | | 4.38!: 4704 (45.6%) | IIIIIIIII | | |
#> | | | | 5.00 : 1407 (13.6%) | II | | |
#> | | | | ! rounded | | | |
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
#> | 2 | calories | Mean (sd) : 9111 (444236) | 1689 distinct values | : | 10321 | 0 |
#> | | [numeric] | min < med < max: | | : | (100.0%) | (0.0%) |
#> | | | 0 < 358 < 30111218 | | : | | |
#> | | | IQR (CV) : 397 (48.8) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
#> | 3 | protein | Mean (sd) : 121 (4158) | 258 distinct values | : | 10321 | 0 |
#> | | [numeric] | min < med < max: | | : | (100.0%) | (0.0%) |
#> | | | 0 < 9 < 236489 | | : | | |
#> | | | IQR (CV) : 25 (34.3) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
#> | 4 | fat | Mean (sd) : 498 (25270) | 294 distinct values | : | 10321 | 0 |
#> | | [numeric] | min < med < max: | | : | (100.0%) | (0.0%) |
#> | | | 0 < 19 < 1722763 | | : | | |
#> | | | IQR (CV) : 26 (50.7) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
#> | 5 | sodium | Mean (sd) : 8951 (412965) | 2161 distinct values | : | 10321 | 0 |
#> | | [numeric] | min < med < max: | | : | (100.0%) | (0.0%) |
#> | | | 0 < 322 < 27675110 | | : | | |
#> | | | IQR (CV) : 652 (46.1) | | : | | |
#> | | | | | : | | |
#> +----+-----------+---------------------------+----------------------+-------------+----------+---------+
# We can already see, for instance, that the most frequent value of the variable "rating" is 4.38 (about 46% of the total). Moreover, we observe that the variables "calories", "protein", "fat" and "sodium" no longer have missing values: the roughly 20% of recipes with missing nutritional values were removed in Section 2.1.
3.1.2 Visual exploration - Univariate Analysis
3.1.2.1 Rating Barplot
# Bar chart of the number of recipes per rating value, one colour per rating
ggplot(recipes, aes(x = as.factor(rating), fill = as.factor(rating))) +
  geom_bar() +
  scale_fill_manual(
    values = c(
      "red4", "red3", "orangered", "orange", "gold", "greenyellow",
      "green3", "green4"
    )
  ) +
  scale_y_continuous(breaks = seq(0, 10000, 1000)) +
  # the fill only duplicates the x axis, so the legend is hidden
  theme(legend.position = "none") +
  labs(
    title = "Overview of recipes' ratings",
    x = "Rating",
    y = "Count"
  )
# As we can see, most of the ratings have a value of 3.75 or above; more
# specifically, most of the recipes are rated 4.375.
# A zero rating indicates that the recipe has not been evaluated yet (such
# recipes were removed in Section 2.2).
3.1.2.2 Calories - Boxplot and Histogram
# Long format: one row per recipe and nutritional value
recipes_boxplot <- pivot_longer(
  recipes,
  cols = c(calories, protein, fat, sodium),
  names_to = "nutrition",
  values_to = "n_value"
)
# Calories only
recipes_calories <- filter(recipes_boxplot, nutrition == "calories")
# Boxplot of the raw calories values
ggplot(recipes_calories, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  theme_light() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  labs(title = "Boxplot of calories nutritional value", x = "", y = "Value")
# We notice that there are recipes with more than 30'000'000 calories, which
# are clearly outliers. We must discard those values in order to continue
# with a meaningful analysis; otherwise we could not extract any relevant
# information from the plot. If we take out these recipes, we still have
# roughly 30 recipes with more than 7000 calories. For the purpose of this
# precise visualisation we exclude them to have a better overview of the
# boxplot, but we don't consider those as outliers.
# Recipes ranked from the highest to the lowest calories value
arrange(
  select(recipes_calories, title, nutrition, n_value),
  desc(n_value)
)
#> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " calories 3.01e7
#> 2 "Deep-Dish Wild Blueberry Pie " calories 3.00e7
#> 3 "Apricot, Cranberry and Walnut Pie " calories 1.31e7
#> 4 "Lamb Köfte with Tarator Sauce " calories 4.52e6
#> 5 "Rice Pilaf with Lamb, Carrots, and Raisins " calories 4.16e6
#> 6 "Chocolate-Almond Pie " calories 3.36e6
#> 7 "Caramelized Apple and Pear Pie " calories 3.36e6
#> 8 "Merguez Lamb Patties with Golden Raisin Cousco~ calories 5.45e4
#> 9 "Grilled Lamb Chops with Porcini Mustard " calories 2.41e4
#> 10 "Braised Short Ribs with Red Wine Gravy " calories 1.96e4
#> # i 10,311 more rows
# Keep only the recipes with at most 7000 calories for the visualisations
recipes_calories <- filter(recipes_calories, n_value <= 7000)
# Calories boxplot
ggplot(recipes_calories, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  scale_y_continuous(breaks = seq(0, 7000, 500)) +
  theme_light() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  labs(
    title = "Boxplot of calories nutritional value",
    x = "",
    y = "Calories"
  )
# Calories histogram
ggplot(recipes_calories, aes(x = n_value)) +
  geom_histogram(binwidth = 50, fill = "red3", color = "red3", alpha = 0.9) +
  scale_x_continuous(breaks = seq(0, 10000, 1000)) +
  scale_y_continuous(breaks = seq(0, 1750, 250)) +
  theme(plot.title = element_text(size = 15)) +
  labs(
    title = "Distribution of calories across all recipes",
    x = "Calories",
    y = "Count"
  )
We can observe that most of the recipes have between 200 and 600 calories. By checking with the histogram the distribution of calories, we observe that indeed most of the recipes have less than 1000 calories.
3.1.2.3 Protein - Boxplot and Histogram
# Protein only
recipes_protein <- filter(recipes_boxplot, nutrition == "protein")
# Boxplot of the raw protein values
ggplot(recipes_protein, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  theme_light() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  labs(title = "Boxplot of protein nutritional value", x = "", y = "Value")
# We notice that there are recipes with more than 50'000 grams of protein,
# which are clearly outliers. We must discard those values in order to
# continue with a meaningful analysis; otherwise we could not extract any
# relevant information from the plot. By checking on the epicurious website
# the recipes with protein values above 1000, we also verified that the
# amount of protein was not justified. We came to that conclusion by
# evaluating the average protein content per 100 grams of each ingredient in
# the specific recipe.
# Recipes ranked from the highest to the lowest protein value
arrange(
  select(recipes_protein, title, nutrition, n_value),
  desc(n_value)
)
#> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Rice Pilaf with Lamb, Carrots, and Raisins " protein 236489
#> 2 "Pear-Cranberry Mincemeat Lattice Pie " protein 200968
#> 3 "Deep-Dish Wild Blueberry Pie " protein 200210
#> 4 "Lamb Köfte with Tarator Sauce " protein 166471
#> 5 "Apricot, Cranberry and Walnut Pie " protein 87188
#> 6 "Chocolate-Almond Pie " protein 58334
#> 7 "Caramelized Apple and Pear Pie " protein 58324
#> 8 "Merguez Lamb Patties with Golden Raisin Cousco~ protein 2074
#> 9 "Manhattan Clam Chowder " protein 1625
#> 10 "Clam and Oyster Chowder " protein 1365
#> # i 10,311 more rows
recipes_protein <- recipes_protein %>%
filter(n_value <= 1000)
# Proteins boxplot (after the outlier cut-off)
ggplot(recipes_protein, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  scale_y_continuous(breaks = seq(0, 7000, 25)) +
  theme_light() +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(title = "Boxplot of protein nutritional value", x = "", y = "Proteins")

# Proteins histogram
ggplot(recipes_protein, aes(x = n_value)) +
  geom_histogram(binwidth = 7, fill = "red3", color = "red3", alpha = 0.9) +
  scale_x_continuous(breaks = seq(0, 1000, 25)) +
  scale_y_continuous(breaks = seq(0, 7000, 250)) +
  theme(plot.title = element_text(size = 15)) +
  labs(
    title = "Distribution of proteins across all recipes",
    x = "Proteins",
    y = "Count"
  )

From the boxplot, we observe that most recipes have less than 30 grams of proteins. By plotting the histogram, we verify that this information is correct. We could even extend the range to 100 proteins per recipe. We assume that recipes with values above this threshold contain ingredients like meat, tuna, salmon or shrimps.
3.1.2.4 Sodium - Boxplot and Histogram
# Sodium subset of recipes_boxplot (rows whose `nutrition` is "sodium").
recipes_sodium <- filter(recipes_boxplot, nutrition == "sodium")

# Boxplot of the raw sodium values, before any outlier is removed.
ggplot(recipes_sodium, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  theme_light() +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(title = "Boxplot of sodium nutritional value", x = "", y = "Value")
# We notice that there are recipes with more than 100'000 milligrams of sodium which are clearly outliers. We must then discard those values in order to continue with a meaningful analysis. By conducting further research, we realize that sodium values above 30'000 are highly suspicious.
# Rank recipes by sodium value, largest first, to inspect the extreme entries.
arrange(
  select(recipes_sodium, title, nutrition, n_value),
  desc(n_value)
)
#> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " sodium 27675110
#> 2 "Deep-Dish Wild Blueberry Pie " sodium 27570999
#> 3 "Apricot, Cranberry and Walnut Pie " sodium 12005810
#> 4 "Lamb Köfte with Tarator Sauce " sodium 7540990
#> 5 "Chocolate-Almond Pie " sodium 3449512
#> 6 "Caramelized Apple and Pear Pie " sodium 3449373
#> 7 "Rice Pilaf with Lamb, Carrots, and Raisins " sodium 3134853
#> 8 "Whole Branzino Roasted in Salt " sodium 132220
#> 9 "Red Snapper Baked in Salt with Romesco Sauce " sodium 132025
#> 10 "Scallops with Mushrooms in White-Wine Sauce " sodium 90572
#> # i 10,311 more rows

# Discard every recipe whose sodium value exceeds the 30000 cut-off.
recipes_sodium <- filter(recipes_sodium, n_value <= 30000)
# Sodium boxplot (after the outlier cut-off)
ggplot(recipes_sodium, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  scale_y_continuous(breaks = seq(0, 30000, 500)) +
  theme_light() +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(title = "Boxplot of sodium nutritional value", x = "", y = "Sodium")

# Sodium histogram
ggplot(recipes_sodium, aes(x = n_value)) +
  geom_histogram(binwidth = 50, fill = "red3", color = "red3", alpha = 0.9) +
  scale_x_continuous(breaks = seq(0, 30000, 1000)) +
  scale_y_continuous(breaks = seq(0, 1750, 250)) +
  theme(plot.title = element_text(size = 15)) +
  labs(
    title = "Distribution of sodium across all recipes",
    x = "Sodium",
    y = "Count"
  )

From the boxplot we observe that most recipes have sodium values below 750 milligrams. The histogram informs us that most of recipes have indeed less than 750 milligrams of sodium, even though we cannot exclude the presence of a good amount of recipes with sodium between 750 and 2000 milligrams.
3.1.2.5 Fat - Boxplot and Histogram
# Fat subset of recipes_boxplot (rows whose `nutrition` is "fat").
recipes_fat <- filter(recipes_boxplot, nutrition == "fat")

# Boxplot of the raw fat values, before any outlier is removed.
ggplot(recipes_fat, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  theme_light() +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(title = "Boxplot of fat nutritional value", x = "", y = "Value")
# We notice that there are recipes with more than 44'000 grams of fat which are clearly outliers. We must then discard those values in order to continue with a meaningful analysis. By checking on the epicurious website recipes with fat values above 1000, we also verified that the amount of fat was not justified. We came to that conclusion by evaluating the average values of fat per 100grams of each ingredient in the specific recipe.
# Rank recipes by fat value, largest first, to inspect the extreme entries.
arrange(
  select(recipes_fat, title, nutrition, n_value),
  desc(n_value)
)
#> # A tibble: 10,321 x 3
#> title nutrition n_value
#> <chr> <chr> <dbl>
#> 1 "Pear-Cranberry Mincemeat Lattice Pie " fat 1722763
#> 2 "Deep-Dish Wild Blueberry Pie " fat 1716279
#> 3 "Apricot, Cranberry and Walnut Pie " fat 747374
#> 4 "Rice Pilaf with Lamb, Carrots, and Raisins " fat 221495
#> 5 "Chocolate-Almond Pie " fat 186660
#> 6 "Caramelized Apple and Pear Pie " fat 186642
#> 7 "Lamb Köfte with Tarator Sauce " fat 44198
#> 8 "Grilled Lamb Chops with Porcini Mustard " fat 2228
#> 9 "Braised Short Ribs with Red Wine Gravy " fat 1818
#> 10 "Braised Duck Legs with Shallots and Parsnips " fat 1610
#> # i 10,311 more rows

# Discard every recipe whose fat value exceeds the 40000 cut-off; in the
# listing above this removes the seven largest entries.
recipes_fat <- filter(recipes_fat, n_value <= 40000)
# Fat boxplot (after the outlier cut-off)
ggplot(recipes_fat, aes(x = nutrition, y = n_value, fill = nutrition)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6, option = "A") +
  scale_y_continuous(breaks = seq(0, 3000, 100)) +
  theme_light() +
  theme(
    plot.title = element_text(size = 11),
    legend.position = "none"
  ) +
  labs(title = "Boxplot of fat nutritional value", x = "", y = "Fat")

# Fat histogram
ggplot(recipes_fat, aes(x = n_value)) +
  geom_histogram(binwidth = 7, fill = "red3", color = "red3", alpha = 0.9) +
  scale_x_continuous(breaks = seq(0, 3000, 100)) +
  scale_y_continuous(breaks = seq(0, 7000, 250)) +
  theme(plot.title = element_text(size = 15)) +
  labs(title = "Distribution of fat across all recipes", x = "Fat", y = "Count")

It is hard to interpret the boxplot. There are certain recipes which could have potentially more than 1000 or even 2000 grams of fat because of the high quantity of servings and the use of ingredients such as lamb, duck and bacon. We must then analyse the histogram to have a better overview and we notice that most recipes have fat values below 100 grams.
3.1.3 Visual exploration - Multivariate Analysis
3.1.3.1 Scatterplots of Rating
# Restrict the full recipe table with the protein, sodium and fat cut-offs
# used in the univariate sections, plus a calories cap of 7000. filter()
# also drops rows where any of these four values is NA.
recipes <- filter(
  recipes,
  calories <= 7000,
  protein <= 1000,
  sodium <= 30000,
  fat <= 40000
)
# Scatterplot of Rating-Calories
ggplot(recipes, aes(x = calories, y = rating)) +
  geom_point() +
  labs(
    title = "Scatterplot of rating against calories",
    x = "Calories",
    y = "Rating"
  )
# We can observe that the recipes with more than 2000 calories tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 2000 calories.

# Scatterplot of Rating-Protein
ggplot(recipes, aes(x = protein, y = rating)) +
  geom_point() +
  labs(
    title = "Scatterplot of rating against proteins",
    x = "Proteins",
    y = "Rating"
  )
# We can observe that the recipes with more than 125 grams of proteins tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 125 grams of proteins.

# Scatterplot of Rating-Fat
ggplot(recipes, aes(x = fat, y = rating)) +
  geom_point() +
  labs(title = "Scatterplot of rating against fat", x = "Fat", y = "Rating")
# We can observe that the recipes with more than 100 grams of fat tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 100 grams of fat.

# Scatterplot of Rating-Sodium
ggplot(recipes, aes(x = sodium, y = rating)) +
  geom_point() +
  labs(
    title = "Scatterplot of rating against sodium",
    x = "Sodium",
    y = "Rating"
  )
# We can observe that the recipes with more than 5000 milligrams of sodium tend to have a higher rating. For instance, few recipes with less than a 3 star rating have more than 5000 mg of sodium.

3.1.3.2 Correlogram
# Correlation matrix (cor() defaults) of the rating and the four nutritional
# values, drawn as a correlogram.
corr_nutritional_values <- cor(
  select(recipes, rating, calories, protein, fat, sodium)
)
corrplot(corr_nutritional_values)
The previous scatterplots gave us the illusion that there was some
correlation between rating and the nutritional values. This hypothesis
is refuted, because the correlation with the rating is almost
zero for all the nutritional values. On the other hand, we notice a
strong positive correlation between calories and fat as well as between
calories and proteins.
3.1.3.3 Grouped Scatter
# We decide to plot together the variables which highlight a great level of correlation.
# Each plot first trims the axes' variables to a narrower range, then colours
# the points by rating on a five-colour rainbow gradient.

# Grouped scatter of calories and fat
recipes_plot1 <- filter(recipes, fat <= 400, calories <= 6000)
ggplot(recipes_plot1, aes(x = calories, y = fat, color = rating)) +
  geom_point() +
  scale_color_gradientn(colours = rainbow(5))

# Grouped scatter of calories and protein
recipes_plot2 <- filter(recipes, protein <= 500, calories <= 6000)
ggplot(recipes_plot2, aes(x = calories, y = protein, color = rating)) +
  geom_point() +
  scale_color_gradientn(colours = rainbow(5))

# Grouped scatter of protein and fat
recipes_plot3 <- filter(recipes, fat <= 400, protein <= 350)
ggplot(recipes_plot3, aes(x = protein, y = fat, color = rating)) +
  geom_point() +
  scale_color_gradientn(colours = rainbow(5))

# Grouped scatter of protein and sodium
# NOTE(review): `sodium <= 400` looks copied from the fat cut-off; the sodium
# section reports many recipes between 750 and 2000 mg, so this keeps only a
# small share of the data -- confirm the intended threshold.
recipes_plot4 <- filter(recipes, sodium <= 400, protein <= 350)
ggplot(recipes_plot4, aes(x = protein, y = sodium, color = rating)) +
  geom_point() +
  scale_color_gradientn(colours = rainbow(5))

3.2 Ingredients EDA
3.2.1 Feature engineering
# New data frame holding only the ID, title, ingredient columns and rating.
# Through the analysis above, we saw that "drinks" on its own had only 11
# observations, 4 of which also had "drink" = 1 --> we merge the two columns
# into a single "drink" flag covering all drinks.
ingredients_df <- recipes %>%
  mutate(drink = as.numeric(drink == 1 | drinks == 1)) %>% # merge drinks into drink
  select(ID, title, all_of(all_ingredients), rating)

3.2.1.1 Adding binary columns
# One 0/1 flag per ingredient category: 1 when at least one ingredient column
# of that category equals 1 for the recipe.
#
# The former `na.rm = TRUE` argument was dropped: if_any() forwarded it through
# `...` to the `~ .x == 1` lambda, which ignores it, so it never removed any
# NA (and passing `...` is deprecated in dplyr >= 1.1.0). As before, a row
# with NA ingredients and no 1 in a category yields NA for that flag.
ingredients_df_bin <- ingredients_df %>%
  mutate(
    vegetables_bin = as.numeric(if_any(all_of(vegetables_vec), ~ .x == 1)),
    meats_bin = as.numeric(if_any(all_of(all_meats), ~ .x == 1)),
    fish_bin = as.numeric(if_any(all_of(fish_vec), ~ .x == 1)),
    seafood_bin = as.numeric(if_any(all_of(seafood_vec), ~ .x == 1)),
    herbs_bin = as.numeric(if_any(all_of(herbs_vec), ~ .x == 1)),
    nuts_bin = as.numeric(if_any(all_of(nuts_vec), ~ .x == 1)),
    fruits_bin = as.numeric(if_any(all_of(fruits_vec), ~ .x == 1)),
    #drinks_bin = as.numeric(if_any(all_of(drink_no_alcohol_vec), ~ .x == 1)),
    cheese_bin = as.numeric(if_any(all_of(cheeses_vec), ~ .x == 1)),
    dairy_bin = as.numeric(if_any(all_of(dairy_vec), ~ .x == 1)),
    spices_bin = as.numeric(if_any(all_of(spices_vec), ~ .x == 1)),
    cereals_bin = as.numeric(if_any(all_of(cereals_vec), ~ .x == 1)),
    carbs_bin = as.numeric(if_any(all_of(carbs_vec), ~ .x == 1)),
    dessert_bin = as.numeric(if_any(all_of(dessert_vec), ~ .x == 1)),
    # egg is a single ingredient column, so its flag is the column itself
    egg_bin = egg
  ) %>%
  # put the identifiers and the new flags first
  select(ID, title, contains("bin"), everything())

The fact that both select the same number of rows makes having general categories redundant in the dataset. They are not useful to create the binary columns, and they are also not useful to compute the total amount of ingredients in each category per recipe –> let’s just not include them in the first place
####testing if I still need to include the general category to create the binary column now that I modified the df to only include recipes with ingredients specified
#
# #6586
# ingredients_df %>%
# mutate(vegetables_bin = as.factor(as.numeric(if_any(c(vegetable, all_of(vegetables_vec)), ~.x == 1, na.rm = TRUE)))) %>%
# filter(vegetables_bin == 1)
#
# #6586
# ingredients_df %>%
# mutate(vegetables_bin = as.factor(as.numeric(if_any(all_of(vegetables_vec), ~.x == 1, na.rm = TRUE)))) %>%
# filter(vegetables_bin == 1)
3.2.1.2 Adding total columns
# Per-recipe counts: for each category (and for all ingredients together), sum
# the category's 0/1 ingredient columns row by row.
ingredients_df_total <- ingredients_df %>%
  mutate(
    total_ingredients = rowSums(across(all_of(all_ingredients))),
    total_vegetables = rowSums(across(all_of(vegetables_vec))),
    total_meat = rowSums(across(all_of(all_meats))),
    total_fish = rowSums(across(all_of(fish_vec))),
    total_seafood = rowSums(across(all_of(seafood_vec))),
    total_herbs = rowSums(across(all_of(herbs_vec))),
    total_nuts = rowSums(across(all_of(nuts_vec))),
    total_fruits = rowSums(across(all_of(fruits_vec))),
    #total_drinks = rowSums(across(all_of(drink_no_alcohol_vec))),
    total_cheese = rowSums(across(all_of(cheeses_vec))),
    total_dairy = rowSums(across(all_of(dairy_vec))),
    total_spices = rowSums(across(all_of(spices_vec))),
    total_cereals = rowSums(across(all_of(cereals_vec))),
    total_carbs = rowSums(across(all_of(carbs_vec))),
    total_dessert = rowSums(across(all_of(dessert_vec)))
  ) %>%
  # put the identifiers and the new totals first
  select(ID, title, contains("total"), everything())

Creating “ingredients_df_full” which contains bin columns, total columns, and original ingredients columns
# ID plus every total_* column, ready to be joined onto the binary table.
total_join <- ingredients_df_total %>%
  select(ID, contains("total"))

# Binary flags + totals + original ingredient columns in one table.
# The join key is stated explicitly (as in the other joins of this report)
# instead of relying on dplyr's natural join and its "Joining with ..." message.
# NOTE(review): assumes ID is the only column shared by the two tables, i.e.
# no ingredient column name contains "total" -- confirm.
ingredients_df_full <- ingredients_df_bin %>%
  left_join(total_join, by = "ID") %>%
  select(ID, title, rating, contains("bin"), contains("total"), everything())

3.2.2 Analysis part - Within ingredients
3.2.2.1 Frequency of ingredients - binary columns categories
This gives us interesting information about the frequency of each ingredient being present at least once in a recipe. As we can see, there is at least one vegetable in around 6750 recipes out of the 11380 total we have. Inversely, a very low amount of recipes contains at least one type of cereal.
# Names of all the binary (*bin*) columns, kept so they can be selected by
# name afterwards.
binary_columns <- colnames(select(ingredients_df_bin, contains("bin")))

# Number of recipes flagged 1 in each binary category: reshape the flags to
# long format, then sum them per category.
total_categories <- ingredients_df_bin %>%
  select(ID, all_of(binary_columns)) %>%
  pivot_longer(-ID, names_to = "category", values_to = "binary_value") %>%
  group_by(category) %>%
  summarise(total = sum(binary_value))

# Frequency of each binary category, bars ordered by increasing total.
ggplot(
  total_categories,
  aes(x = reorder(category, total), y = total, fill = total)
) +
  geom_bar(stat = "identity") +
  scale_fill_viridis() +
  scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
  labs(
    title = "Total amount of recipes containing at least one ingredient in defined categories",
    x = "Category",
    y = "Amount of recipes"
  )

The bar plots below give us similar information about the amount of recipes which contain at least one ingredient in each category. The only category for which an ingredient is present at least once in more than 50% of the recipes is vegetables.
# Bar plots of every binary category, with each flag converted to a factor.
plot_bar(
  mutate(
    select(ingredients_df_bin, contains("bin")),
    across(everything(), as.factor)
  ),
  order_bar = FALSE
)

3.2.2.2 Correlation between binary columns, including the rating
We can see that no binary variable is correlated with the rating in any way.
We see a fairly strong negative correlation between vegetables and dessert, and between vegetables and fruits. This makes sense, as these ingredients are rarely found together in recipes. As a side note, we chose to classify tomato as a vegetable and strongly stand by this opinion :)
Concerning positive correlations, we see nuts and dessert as highly correlated. This is probably because they go well together in sugary recipes. Additionally, egg and dairy are also correlated at 0.23. This most likely comes from patisserie recipes where eggs and dairy ingredients go hand in hand.
# Correlation between the binary columns and the rating, all kept numeric.
plot_correlation(select(ingredients_df_bin, contains("bin"), rating))

#testing if corr matrix changes when we set the binary columns as factor instead of num
plot_correlation(
  mutate(
    select(ingredients_df_bin, contains("bin"), rating),
    across(-rating, as.factor)
  )
)
#this gives weird results

#now trying with rating as a factor
plot_correlation(
  mutate(
    select(ingredients_df_bin, contains("bin"), rating),
    across(rating, as.factor)
  )
)

3.2.2.3 Total individual ingredients
# Which single ingredients are present in the most recipes?
# Long format: one row per (recipe ID, ingredient) pair with its 0/1 value.
df <- pivot_longer(
  select(ingredients_df, -title, -rating),
  -ID,
  names_to = "ingredient",
  values_to = "value"
)

# Sum the flags per ingredient and keep the ten largest totals.
ing_top10 <- df %>%
  group_by(ingredient) %>%
  summarise(total = sum(value)) %>%
  ungroup() %>%
  arrange(desc(total)) %>%
  slice(1:10)

# Bars ordered by increasing total, one colour per ingredient.
ggplot(
  ing_top10,
  # mutate(ingredient = fct_rev(ingredient)) was tried here and left disabled
  aes(x = reorder(ingredient, total), y = total, fill = ingredient)
) +
  geom_bar(stat = "identity") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_fill_viridis(discrete = TRUE) +
  labs(
    title = "Total amount of recipes containing each ingredient\nTop 10",
    x = "Ingredient",
    y = "Value"
  )

3.2.2.4 Correlation between number of ingredients and rating
# Correlation between the per-category ingredient totals and the rating.
plot_correlation(select(ingredients_df_total, contains("total"), rating))

3.2.2.5 Amount of ingredients per recipe
The number of ingredients per recipe is more or less normally distributed, with a mean around 4.75.
# Distribution of the number of classified ingredients per recipe, with the
# mean marked by a dashed red line. (Fixes the "Distrubution" typo in the
# plot title.)
ingredients_df_total %>%
  select(ID, title, total_ingredients) %>%
  ggplot(aes(x = total_ingredients)) +
  geom_bar() +
  geom_vline(
    aes(xintercept = mean(total_ingredients)),
    color = "red", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(breaks = seq(1, 12, by = 1)) +
  labs(
    x = "Number of ingredients per recipe",
    y = "Recipe Count",
    title = "Distribution of number of ingredients per recipe"
  )
#we notice that 117 (no NAs and RAT0, and not duplicated) recipes have 0 ingredients, let's investigate why and how that's possible
# Recipes for which none of the classified ingredient columns is set.
ingredients_df_total %>%
  filter(total_ingredients == 0)

# For three of these recipes, show only the columns of `recipes` that hold a 1.
# select(where(...)) replaces the superseded select_if(); na.rm = TRUE keeps
# the predicate TRUE/FALSE even when a column is NA for the inspected recipe.

#let's pick recipe ID number 1183 which should have poppy and sesame seeds according to the title
recipes %>%
  filter(ID == 1183) %>%
  select(where(~ any(. == 1, na.rm = TRUE)))
#we can see that only 3 variables are equal to 1 here
recipes %>%
  filter(ID == 365) %>%
  select(where(~ any(. == 1, na.rm = TRUE)))
recipes %>%
  filter(ID == 1089) %>%
  select(where(~ any(. == 1, na.rm = TRUE)))
#####
#QUESTION: do we want to keep those recipes?
#####
# Recipes at the other extreme: more than 10 classified ingredients.
ingredients_df_total %>%
  filter(total_ingredients > 10)

Based on this information, we decide to eliminate those 117 observations which don’t contain any ingredients that we have classified in our vectors.
# Keep only the recipes that have at least one classified ingredient.
ingredients_df_full <- filter(ingredients_df_full, total_ingredients != 0)

Besides the total amount of ingredients, let’s check the amount of ingredients per recipe for the top 3 categories in terms of ingredients frequency (i.e., vegetables, fruits, meats)
# Number of vegetables per recipe, among recipes with at least one vegetable;
# the dashed blue line marks the mean.
ingredients_df_full %>%
  filter(vegetables_bin == 1) %>%
  select(ID, title, total_vegetables) %>%
  ggplot(aes(x = total_vegetables)) +
  geom_bar() +
  geom_vline(
    aes(xintercept = mean(total_vegetables)),
    color = "blue", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(breaks = seq(1, 9, by = 1))

# Same chart for fruits, among recipes with at least one fruit.
ingredients_df_full %>%
  filter(fruits_bin == 1) %>%
  select(ID, title, total_fruits) %>%
  ggplot(aes(x = total_fruits)) +
  geom_bar() +
  geom_vline(
    aes(xintercept = mean(total_fruits)),
    color = "blue", linetype = "dashed", size = 1
  ) +
  scale_x_continuous(breaks = seq(1, 9, by = 1))

# ingredients_df_full %>%
# select(ID, title, total_meat) %>%
# ggplot(aes(x=total_meat)) +
# geom_bar() + geom_vline(aes(xintercept=mean(total_meat)),color="blue", linetype="dashed", size=1)+
# labs(x="Number of meats per recipe", y = "Recipe Count", title = "Distrubution of number of meats per recipe")
#let's try to filter by recipes which contain meat to see if my functions work
# Number of meats per recipe, among recipes with at least one meat; the dashed
# blue line marks the mean. (Fixes the "Distrubution" typo in the plot title.)
ingredients_df_full %>%
  filter(meats_bin == 1) %>%
  select(ID, title, total_meat) %>%
  ggplot(aes(x = total_meat)) +
  geom_bar() +
  geom_vline(
    aes(xintercept = mean(total_meat)),
    color = "blue", linetype = "dashed", size = 1
  ) +
  labs(
    x = "Number of meats per recipe",
    y = "Recipe Count",
    title = "Distribution of number of meats per recipe"
  )
#why do we still have value in 0 meats --> it was because when creating the total meat column in ingredients_df_test I did not include the general meat category

3.2.3 Categories relative to rating
It’s hard to interpret the results clearly with so many rating levels, so let’s reduce them to 2 categories –> above 4, and 4 or below
# Bar plots of each binary category broken down by rating, with every column
# (rating included) converted to a factor.
plot_bar(
  mutate(
    select(ingredients_df_bin, contains("bin"), rating),
    across(everything(), as.factor)
  ),
  by = "rating",
  order_bar = FALSE
)
We now have only 2 categories: recipes with a rating above 4 and recipes
with a rating of 4 or below. There is no clear relationship in those graphs
either, and this confirms the correlation results that we have
above.
If we look at vegetables for example, we can see that the proportion of recipes with ratings above 4 is higher for recipes containing no vegetables, when compared to recipes containing at least one vegetable.
# Same breakdown with the rating collapsed to a two-level flag:
# rating_4 is 1 when the rating is above 4 and 0 otherwise.
ingredients_df_bin %>%
  select(contains("bin"), rating) %>%
  mutate(rating_4 = ifelse(rating > 4, 1, 0)) %>%
  mutate(across(everything(), as.factor)) %>%
  select(-rating) %>%
  plot_bar(by = "rating_4", order_bar = FALSE)

3.3 Mixed EDA - ingredients and nutritional value
# Identifiers, rating and the four nutritional values of every recipe.
recipes_select <- recipes %>%
  select(ID, title, rating, calories, protein, sodium, fat)

# ID plus the total_* columns. contains() is already a selection helper, so it
# is used directly (as for total_join above) rather than wrapped in all_of(),
# which is meant for external vectors of names.
ingredients_select <- ingredients_df_total %>%
  select(ID, contains("total"))

# Nutritional values + ingredient totals.
recipes_more <- recipes_select %>%
  left_join(ingredients_select, by = "ID")

# ID plus the binary category flags.
ingredients_bin_select <- ingredients_df_bin %>%
  select(ID, contains("bin"))

# Nutritional values + ingredient totals + binary category flags.
recipes_full <- recipes_more %>%
  left_join(ingredients_bin_select, by = "ID")

3.3.1 Correlation between nutritional values and categories
# Correlation between every column of recipes_full except the ID.
plot_correlation(select(recipes_full, -ID))
For instance we notice a positive correlation between proteins and
meats_bin which includes all sorts of meat. Another visible positive
correlation is the one between sodium and seafood_bin. We might also
want to investigate the relationship between calories and carbs_bin.
3.3.2 Barplot and boxplot - Meat and Proteins
# Barplot: mean protein content of recipes with (1) and without (0) meat.
# `fun = mean` replaces `fun.y = mean`, deprecated since ggplot2 3.3.0.
recipes_full %>%
  ggplot(aes(x = factor(meats_bin), y = protein)) +
  stat_summary(fun = mean, geom = "bar") +
  ggtitle("Average amount of proteins per recipe with and without meat") +
  xlab("Presence of Meat or not") +
  ylab("Protein Content in grams")

# Boxplot: protein distribution with and without meat, limited to recipes
# below 450 grams of protein.
recipes_full %>%
  filter(protein < 450) %>%
  ggplot(aes(x = factor(meats_bin), y = protein, fill = factor(meats_bin))) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 7000, 25)) +
  xlab("Presence of Meat or not") +
  ylab("Protein Content in grams") +
  theme(legend.position = "none")
# Boxplots per different kinds of meat
# All columns of `recipes`, restricted to the IDs present in recipes_full.
recipes_general <- left_join(select(recipes_full, ID), recipes, by = c("ID"))

# One row per (recipe, meat) pair where that meat is present.
# NOTE(review): the meat names are listed by hand; presumably they mirror
# `all_meats` -- verify, since any column of all_meats missing from this list
# would stay as a wide column.
recipes_meat <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(all_meats)) %>%
  pivot_longer(
    cols = c(
      "beef", "beef_rib", "beef_shank", "beef_tenderloin", "brisket",
      "ground_beef", "hamburger", "veal", "bacon", "ham", "pork", "pork_chop",
      "pork_rib", "pork_tenderloin", "prosciutto", "ground_lamb", "lamb",
      "lamb_chop", "lamb_shank", "rack_of_lamb", "chicken", "duck", "goose",
      "poultry", "poultry_sausage", "quail", "turkey", "meatball", "meatloaf",
      "rabbit", "sausage", "steak", "venison"
    ),
    names_to = "meats",
    values_to = "yes_or_no"
  ) %>%
  filter(yes_or_no == 1)

# Horizontal boxplots of protein per kind of meat, limited to recipes below
# 450 grams of protein.
recipes_meat %>%
  filter(protein < 450) %>%
  ggplot(aes(x = meats, y = protein, fill = meats)) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 7000, 25)) +
  coord_flip() +
  labs(
    title = "Distribution of proteins per recipe according to different kinds of meat",
    x = "Meats",
    y = "Protein Content in grams"
  ) +
  theme(legend.position = "none")
# Here we want to show which kinds of meat specifically have a high level of proteins

3.3.3 Barplot and boxplot - Seafood and Sodium
# Seafood and sodium
# Barplot: mean sodium content of recipes with (1) and without (0) seafood.
# `fun = mean` replaces `fun.y = mean`, deprecated since ggplot2 3.3.0.
recipes_full %>%
  ggplot(aes(x = factor(seafood_bin), y = sodium)) +
  stat_summary(fun = mean, geom = "bar") +
  ggtitle("Average amount of sodium per recipe with and without seafood") +
  xlab("Presence of Seafood or not") +
  ylab("Sodium Content in milligrams")

# Boxplot: sodium distribution with and without seafood, limited to recipes
# below 10000 milligrams of sodium.
recipes_full %>%
  filter(sodium < 10000) %>%
  ggplot(aes(x = factor(seafood_bin), y = sodium, fill = factor(seafood_bin))) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 30000, 500)) +
  xlab("Presence of Seafood or not") +
  ylab("Sodium Content in milligrams") +
  theme(legend.position = "none")
# Boxplots per different kinds of seafood
# One row per (recipe, seafood) pair where that seafood is present.
# NOTE(review): the seafood names are listed by hand; presumably they mirror
# `seafood_vec` -- verify.
recipes_seafood <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(seafood_vec)) %>%
  pivot_longer(
    cols = c(
      "clam", "crab", "lobster", "mussel", "octopus", "oyster", "scallop",
      "shellfish", "shrimp", "squid"
    ),
    names_to = "seafoods",
    values_to = "yes_or_no"
  ) %>%
  filter(yes_or_no == 1)

# Horizontal boxplots of sodium per kind of seafood, limited to recipes below
# 10000 milligrams of sodium.
recipes_seafood %>%
  filter(sodium < 10000) %>%
  ggplot(aes(x = seafoods, y = sodium, fill = seafoods)) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 30000, 500)) +
  coord_flip() +
  labs(
    title = "Distribution of sodium per recipe according to different kinds of seafood",
    x = "Seafood",
    y = "Sodium Content in milligrams"
  ) +
  theme(legend.position = "none")
# Here we want to show which kinds of seafood specifically have a high level of sodium

3.3.4 Barplot and boxplot - Carbs and Calories
# Carbs and calories
# Barplot: mean calories of recipes with (1) and without (0) carbohydrates.
# `fun = mean` replaces `fun.y = mean`, deprecated since ggplot2 3.3.0.
recipes_full %>%
  ggplot(aes(x = factor(carbs_bin), y = calories)) +
  stat_summary(fun = mean, geom = "bar") +
  ggtitle("Average amount of calories per recipe with and without carbohydrates") +
  xlab("Presence of carbohydrates or not") +
  ylab("Calories content")
# Afterwards we would also want to show which kinds of carbs specifically have a high number of calories

# Boxplot: calories distribution with and without carbohydrates.
recipes_full %>%
  ggplot(aes(x = factor(carbs_bin), y = calories, fill = factor(carbs_bin))) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 7000, 500)) +
  xlab("Presence of Carbs or not") +
  ylab("Calories Content") +
  theme(legend.position = "none")
# Boxplots per different kinds of carbs
# One row per (recipe, carb) pair where that carbohydrate-rich food is present.
# NOTE(review): the names are listed by hand; presumably they mirror
# `carbs_vec` -- verify.
recipes_carbs <- recipes_general %>%
  select(ID, title, rating, calories, protein, fat, sodium, all_of(carbs_vec))
recipes_carbs <- recipes_carbs %>%
  pivot_longer(
    cols = c(
      "brown_rice", "chickpea", "cornmeal", "couscous", "hominy_cornmeal_masa",
      "orzo", "pasta", "potato", "rice", "semolina", "sweet_potato_yam",
      "wild_rice"
    ),
    names_to = "carbs",
    values_to = "yes_or_no"
  ) %>%
  filter(yes_or_no == 1)

# Horizontal boxplots of calories per kind of carbohydrate-rich food.
# The former `filter(sodium < 10000)` was removed: it was a leftover from the
# seafood/sodium plot and silently dropped high-sodium recipes from a calories
# chart. Like the carbs boxplot just before it, this plot now uses every recipe.
recipes_carbs %>%
  ggplot(aes(x = carbs, y = calories, fill = carbs)) +
  geom_boxplot(alpha = 0.3) +
  scale_y_continuous(breaks = seq(0, 7000, 500)) +
  coord_flip() +
  ggtitle("Distribution of calories per recipe according to different kinds of food high in carbohydrates") +
  xlab("Carbs") +
  ylab("Calories Content") +
  theme(legend.position = "none")
# Here we want to show which kinds of food high in carbs specifically have a high level of calories

3.4 Seasons and Recipe Type EDA
#TO ADD WHEN COMPLETED
Questions
- Should we convert the binary columns to factor, or can we leave them as integer for modelling?
- Should we balance the data, both in the rating_bin case and for the normal rating with 7 classes?
- Should we normalise the numerical data?
- It’s mentioned in the slides that the validation set should not be balanced, but how do we do that using train() with caret?
- Should we really use balanced data for training? At least for KNN it always makes K=1 better, whereas K was larger when we trained with unbalanced data. Apparently for KNN, it’s not required to balance data.